/*******************************************************************************
																				
	DESCRIPTION: 	This do file creates the dataset with SILC variables.
					
*******************************************************************************/

* Reset Stata's memory and set the prefix used in this script's output file names.
clear all
global id_code "001_11"

************************************************
* A1. SILC data - combine all years
************************************************

* Load the 2006 file first, then stack the 2007-2019 files underneath it.
* NOTE: ${SILC} is not defined in this file; it must be set before this script runs.
use "${SILC}/silc_2006", clear

local wave_first 2007
local wave_last  2019
forvalues wave = `wave_first'/`wave_last' {
	append using "${SILC}/silc_`wave'"
}

************************************************
* A2: SILC data - clean and rename variables
************************************************

* Give the SILC items readable names. The two lists are matched position by
* position: the first old name becomes the first new name, and so on.
rename (HB010 HS060 PH010 PH020 PH030 PW010 PW020 PW050 PW060 PW080) ///
       (year capacityFinExp genHealth LTIllnessSILC limitActiv lifeSatisf meaningLife beingNervous feelingDown feelingDepre)

* Retain only the person identifier, the year and the renamed items
keep LopNr_PersonNr year ///
	capacityFinExp ///
	genHealth LTIllnessSILC limitActiv ///
	lifeSatisf meaningLife beingNervous feelingDown feelingDepre

* Removing the duplicates
* (The observation counts quoted in the trailing comments are those recorded
*  when the script was written; they depend on the data extract in use.)

* Step 1: drop rows that are identical on every kept variable
duplicates drop // 4 observations deleted

* Step 2: inspect the remaining duplicates on the (person, year) key
duplicates report LopNr_PersonNr year // surplus of 11

* dup = 0 for a unique person-year; otherwise 1, 2, ... within the duplicated group
bys LopNr_PersonNr year: gen dup=cond(_N==1,0,_n)

* Two pairs of duplicates differ in terms of capacityFinExp and have other variables missing;
* both members of each pair are dropped
drop if (LopNr_PersonNr== 1704900 | LopNr_PersonNr==9519937) & year == 2006 // 4 observations dropped

* In the remaining duplicate pairs one observation has most variables missing, we drop that observation
* (capacityFinExp is deliberately not part of this condition, as in the original rule)
drop if genHealth==. & LTIllnessSILC==. & limitActiv==. & lifeSatisf==. & meaningLife==. & beingNervous==. & feelingDown==. & feelingDepre==. & dup>0 // 9 observations dropped
drop dup

* Fail fast if the hard-coded rules above did not leave exactly one row per
* person-year (e.g. because the SILC extract changed). missok keeps this check
* no stricter than a later 1:1 merge on the same key, which tolerates missing
* key values as long as the key is unique.
isid LopNr_PersonNr year, missok

/* Build one row per person and year (2006-2019) for every person in the PES
dataset, and merge the SILC data onto that grid. Having a row for every year
means a lag can still be taken when a person is observed in SILC only in the
year before unemployment and not in the year of unemployment itself. */

preserve
	* One row per distinct identifier in the unemployment-spell file
	* NOTE(review): Lop* presumably matches only LopNr_PersonNr here - confirm against that file
	use "${data}/001_1_UnemploymentSpells.dta", clear
	keep Lop*
	duplicates drop

	* Replicate each person once per calendar year and number the copies
	local first_year 2006
	local last_year  2019
	expand `last_year' - `first_year' + 1
	bysort Lop*: gen year = `first_year' - 1 + _n

	tempfile person_years
	save `person_years'
restore

* Attach the grid to the SILC data; SILC rows with no match in the grid
* (_merge == 1) are discarded, grid-only rows are kept with missing SILC items
merge 1:1 LopNr_PersonNr year using `person_years'
keep if _merge != 1
drop _merge

save "${data}/${id_code}_SILCData.dta", replace

* Generate lags of variables

* Declare the panel structure (person x year) so time-series operators work
xtset LopNr_PersonNr year

* For every SILC item, L_<name> holds the value from the preceding year
local silc_items capacityFinExp genHealth LTIllnessSILC limitActiv lifeSatisf meaningLife beingNervous feelingDown feelingDepre
foreach item of local silc_items {
	gen L_`item' = L1.`item'
}

save "${data}/${id_code}_SILCData.dta", replace

* Principal component analysis

use "${data}/${id_code}_SILCData.dta", clear

* PCA on the five PW items; store the scores of all five components as pc1-pc5
local wellbeing_items lifeSatisf meaningLife beingNervous feelingDown feelingDepre
pca `wellbeing_items'
predict pc1 pc2 pc3 pc4 pc5, score

* PCA on the three PH items; store only the first component's score
local health_items genHealth LTIllnessSILC limitActiv
pca `health_items'
predict HealthPC1, score

save "${data}/${id_code}_SILCData.dta", replace
